library(phyloseq)
library(ggplot2)
library(dplyr)
library(scales)
library(grid)
library(reshape2)
library(gridExtra)
library(vegan)
library(cowplot)
library(gtable)
library(pander)
library(tidyr)
## objects and functions that will be useful throughout this analysis
# Set the ggplot theme
theme_set(theme_bw())
# Color palette for stations
station_colors = c("red", "#ffa500", "#0080ff")
# Function to order date levels correctly
order_dates_aug11 <- function(df) {
df$Date <- factor(df$Date,
levels = c("6/16","6/30","7/8","7/14","7/21",
"7/29","8/4","8/11","8/18","8/25","9/2","9/8","9/15",
"9/23","9/29","10/6","10/15","10/20","10/27"))
return(df)
}
order_dates <- function(df) {
df$Date <- factor(df$Date,
levels = c("6/16","6/30","7/8","7/14","7/21",
"7/29","8/4", "8/18","8/25","9/2","9/8","9/15",
"9/23","9/29","10/6","10/15","10/20","10/27"))
return(df)
}
named_list <- function(...){
names <- as.list(substitute(list(...)))[-1L]
result <- list(...)
names(result) <- names
result
}
# Source some useful functions for data normalizatio
source("~/git_repos/MicrobeMiseq/R/miseqR.R")
load("~/git_repos/chabs/miseq_may2015/analysis/data/erie-data.RData")
# Inspect erie phyloseq object
erie
## phyloseq-class experiment-level object
## otu_table() OTU Table: [ 7192 taxa and 53 samples ]
## sample_data() Sample Data: [ 53 samples by 32 sample variables ]
## tax_table() Taxonomy Table: [ 7192 taxa by 9 taxonomic ranks ]
Scale counts to an even depth by dividing by total reads and multiplying by the minimum library size
## Scale reads in OTU table to even depth
depth = 15000
erie_scale <-
erie %>%
scale_reads(n = depth, round = "round")
Create figure 1, showing pigment concentrations, toxin concentration, and proportion of cyanobacterial reads over time and stations.
# Import metadata file with nutrients, pigments and toxin
nutrient <- read.csv("~/git_repos/chabs/miseq_may2015/analysis/data/nutrient_cleaned.csv")
# Format nutrient data
nutrient_sub <-
nutrient %>%
filter(!(Date %in% c("5/27", "6/10", "11/3"))) %>%
order_dates_aug11()
# Calculate relative abundance of Cyanobacteria at each date
cyano_abundance <-
erie_scale %>%
tax_glom(taxrank = "Phylum") %>% # conglomerate OTUs to phylum level
transform_sample_counts(function(x) {x/sum(x)} ) %>% # transform to relative abundance
subset_taxa(Phylum == "Cyanobacteria") %>% # Subset to just Cyanobacteria
psmelt() %>% # melt phyloseq object
rename(Cyanobacteria = Abundance) %>%
select(Cyanobacteria, Date, Station)
# Merge cyanobacteria data with nutrient df
bloom_df <-
nutrient_sub %>%
left_join(cyano_abundance, by = c("Station", "Date")) %>%
mutate(Phycocyanin = ifelse(Phycocyanin > 80, 80, Phycocyanin)) %>% # lower extreme values to plot better
select(Station, Date, Phycocyanin, Chla, ParMC, Cyanobacteria) %>%
melt(id.vars = c("Station", "Date")) %>%
order_dates_aug11()
# Make a faceted ggplot of the four bloom variables over time and grouped by station
bloom_plots <- ggplot(bloom_df,
aes(x = Date, y = value, group = Station, color = Station, shape = Station)
) +
facet_grid(variable~., scales = "free_y") +
geom_point(size = 1.3) +
geom_line(size = 1) +
ylab("") +
scale_x_discrete(
breaks = c("7/8", "8/4", "9/2", "10/6"),
labels = c("Jul", "Aug", "Sep", "Oct"),
drop = FALSE
) +
scale_color_manual(values = station_colors) +
theme(
strip.background = element_blank(),
strip.text = element_text(size = 11),
axis.title.x = element_blank()
)
# function to extract a legend from a ggplot object
grab_legend <- function(a_ggplot) {
tmp <- ggplot_gtable(ggplot_build(a_ggplot))
leg <- which(sapply(tmp$grobs, function(x) x$name) == "guide-box")
legend <- tmp$grobs[[leg]]
legend
}
station_legend <- grab_legend(bloom_plots)
bloom_plots
par(mfrow = c(1,3))
hist(nutrient$Chla)
hist(nutrient$Phycocyanin)
hist(nutrient$ParMC)
These distributions are very non-normal. Let’s try log-scaling
par(mfrow = c(1,3))
hist(log(nutrient$Chla), xlab = "log Chla", main = "")
hist(log(nutrient$Phycocyanin), xlab = "log Phycocyanin", main = "")
hist(log(nutrient$ParMC), xlab = "log ParMC", main = "")
These look better, so we will use these transformed variables for our correlation tests. Since phycocyanin and parmc both have zeroes, we will add a constant to all values. We selected this constant to be small and to minimally impact the distribution shape
par(mfrow = c(1,2))
logChla <- log(nutrient$Chla)
logPhyco <- log(nutrient$Phycocyanin + 0.1)
logParMC <- log(nutrient$ParMC + 0.1)
hist(logPhyco, main = "", xlab = "log Phyco + 0.1")
hist(logParMC, main = "", xlab = "log ParMC + 0.1")
plot(logChla, logPhyco, xlab = "log Chla", ylab = "log Phycocyanin")
# Calculate correlation between Chla and phycocyanin for all sites
cor.test(
x = logChla,
y = logPhyco,
alternative = "two.sided",
method = "pearson"
)
##
## Pearson's product-moment correlation
##
## data: logChla and logPhyco
## t = 10.175, df = 55, p-value = 2.985e-14
## alternative hypothesis: true correlation is not equal to 0
## 95 percent confidence interval:
## 0.6936187 0.8828031
## sample estimates:
## cor
## 0.8081294
It looks like there is a pretty close correlation between chl a and phycocyanin measurements.
par(mfrow = c(1,2))
plot(logChla, logParMC)
plot(logPhyco, logParMC)
cor.test(
x = logChla,
y = logParMC,
alternative = "two.sided",
method = "pearson"
)
##
## Pearson's product-moment correlation
##
## data: logChla and logParMC
## t = 6.2612, df = 55, p-value = 6.071e-08
## alternative hypothesis: true correlation is not equal to 0
## 95 percent confidence interval:
## 0.4622297 0.7753393
## sample estimates:
## cor
## 0.6451002
cor.test(
x = logPhyco,
y = logParMC,
alternative = "two.sided",
method = "pearson"
)
##
## Pearson's product-moment correlation
##
## data: logPhyco and logParMC
## t = 11.949, df = 55, p-value < 2.2e-16
## alternative hypothesis: true correlation is not equal to 0
## 95 percent confidence interval:
## 0.7565981 0.9089839
## sample estimates:
## cor
## 0.8496596
The statistical tests show that there are significant correlations between pigments and toxin, but the plots show that there are several dates in which the toxin levels are 0 but pigments are high, indicating that pigments are not always predictive of toxicity.
# add log scaled measurements to nutrient data frame
In this figure we will display in part A a barplot with the relative abundance of cyanobacteria genera over time and site. In part B we will break up these groups into OTUs and show lineplots for each non-rare OTU over time (mean rel abundance > 0.0001) and site.
## Select only cyano OTUs that have mean relative abundace > 0.0001
n = 15000
thresh = 0.0005
# Calculate mean relative abundance for each OTU
tax_mean <- taxa_sums(erie_scale)/nsamples(erie_scale)
# Prune low abundance taxa using thresh as mean relative abundance
erie_prune_0001 <-
erie_scale %>%
prune_taxa(tax_mean > thresh*n, .)
# Create a melted data frame of selected cyanobacteria OTUs
cyano_otus <-
erie_prune_0001 %>%
transform_sample_counts(function(x) {x/sum(x)}) %>%
subset_taxa(Class == "Cyanobacteria") %>%
psmelt()
################# Plot A #######################
cyano_genus <-
cyano_otus %>%
group_by(Genus) %>%
arrange(Genus) %>%
order_dates()
plot2a <- ggplot(cyano_genus, aes(x = Date, y = Abundance, fill = Genus)) +
facet_grid(~Station) +
geom_bar(stat = "identity") +
ylab("rel. abundance") +
scale_x_discrete(
breaks = c("7/8", "8/4", "9/2", "10/6"),
labels = c("Jul", "Aug", "Sep", "Oct"),
drop = FALSE
) +
scale_fill_brewer(palette = "Paired") +
theme(
axis.title.x = element_blank(),
axis.title.y = element_text(size = 9),
plot.title = element_text(size = 10, face = "bold"),
strip.background = element_blank()
)
genus_legend <- grab_legend(plot2a)
plot2a <- plot2a + theme(legend.position = "none")
# Function to make a ggplot lineplot of an OTU's relative abundance over time
#
# Args:
# df: a melted data frame generated by calling psmelt() on a phyloseq object.
# Contains an "Abundance" column for the OTU's abundance
# otu: the OTU to generate a lineplot for
# taxrank: the taxonomic rank to appear in the plot title (e.g. "Genus")
# Returns:
# a ggplot lineplot
plot_otus <- function(df, otu, taxrank) {
ggplot(df,
aes(x = Date, y = Abundance, group = Station, color = Station, shape = Station)) +
geom_point(size = 2) +
geom_line(size = 0.7) +
ggtitle(paste(df[1, taxrank], otu)) +
ylab("rel. abund") +
scale_color_manual(values = station_colors) +
scale_x_discrete(
breaks = c("7/8", "8/4", "9/2", "10/6"),
labels = c("Jul", "Aug", "Sep", "Oct"),
drop = FALSE
) +
theme(
axis.title.x = element_blank(),
axis.title.y = element_text(size = 9),
legend.position = "none",
plot.title = element_text(size = 10, face = "bold")
)
}
##################### Plot B #############################
cyano_otu_names <- as.list(levels(cyano_otus$Species))
names(cyano_otu_names) <- levels(cyano_otus$Species)
# Generate a lineplot for each cyanobacteria with mean relative abundance > 0.0001
cyano_otu_plots <- lapply(cyano_otu_names,
function(otu) {
df_otu <- filter(cyano_otus, OTU == otu)
plot <- plot_otus(df = df_otu, otu = otu, taxrank = "Genus")
return(plot)
}
)
plot2b <- arrangeGrob(
cyano_otu_plots$Otu00007, cyano_otu_plots$Otu00177, cyano_otu_plots$Otu00005,
cyano_otu_plots$Otu00044, cyano_otu_plots$Otu00304, cyano_otu_plots$Otu00037,
cyano_otu_plots$Otu00147, cyano_otu_plots$Otu00193, cyano_otu_plots$Otu00049,
ncol = 3, nrow = 3
)
##################### Compile plots #####################################
ggdraw() +
draw_plot(plot2a, x = 0.02, y = 0.72, width = .8, height = 0.28) +
draw_plot(genus_legend, x = 0.84, y = .79, width = .16, height = .18) +
draw_plot(plot2b, x = 0.02, y = 0, width = .8, height = 0.67) +
draw_plot(station_legend, x = 0.82, y = .5, width = .18, height = .18) +
draw_plot_label(c("A", "B"), c(0, 0), c(1, 0.5), size = 14)
# My own subsetting function, similar to phyloseq::subset_taxa, except taxa can
# be passed as arguments within functions without weird environment errors
#
# Args:
# physeq: a phyloseq object
# taxrank: taxonomic rank to filter on
# taxa: a vector of taxa groups to filter on
#
# Returns:
# a phyloseq object subsetted to the x taxa in taxrank
my_subset_taxa <- function(physeq, taxrank, taxa) {
physeq_tax_sub <- tax_table(physeq)[tax_table(physeq)[ , taxrank] %in% taxa, ]
tax_table(physeq) <- physeq_tax_sub
return(physeq)
}
Here we calculate the inverse simpson index for each bacterial group as well as all NcBacteria
mytaxa <- c(
"Actinobacteria", "Alphaproteobacteria",
"Betaproteobacteria", "Bacteroidetes",
"Gammaproteobacteria", "Deltaproteobacteria",
"Verrucomicrobia"
)
names(mytaxa) <- mytaxa
# Taxonomic ranks of mytaxa
mytaxa_taxrank <- c(
"Phylum", "Class", "Class",
"Phylum", "Class", "Class", "Phylum"
)
names(mytaxa_taxrank) <- mytaxa
# For each bacterial group, subset the phyloseq object and calculate the inverse simpson
invsimp_df <- Map(
function(tax, rank) {
# Subset to the taxonomic group
erie_sub <- my_subset_taxa(physeq = erie, taxrank = rank, taxa = tax)
# Calculate inverse simpson
invsimp <- estimate_richness(erie_sub, measures = "InvSimpson")
},
mytaxa, mytaxa_taxrank
)
# Calculate inv simpson of all NcBacteria
ncbacteria <- subset_taxa(erie, Class != "Cyanobacteria")
invsimp_df$NcBacteria <- estimate_richness(ncbacteria, measures = "InvSimpson")
# Merge sample metadata with these estimates
bloom_dat <- data.frame(sample_data(erie)) %>%
select(SampleID, Chla, Station, Date, Phycocyanin, ParMC, pH, TP) %>%
order_dates()
nutrient <- order_dates(nutrient)
ggplot(nutrient, aes(x = Date, y = TP, group = Station, color = Station)) +
geom_point() +
geom_line()
plot(log(bloom_dat$TP), log(bloom_dat$Chla))
plot(log(bloom_dat$TP), log(bloom_dat$Phycocyanin))
plot(log(bloom_dat$TP), bloom_dat$pH)
plot(log(bloom_dat$Chla), bloom_dat$pH)
# Create a df with a "Diversity" column that includes richness and inv. simpson,
# and log-chl a values from erie sample_data
alphadiv <- as.data.frame(invsimp_df)
names(alphadiv) <- names(invsimp_df)
alphadiv$SampleID <- rownames(alphadiv)
alphadiv_df <- alphadiv %>%
melt(id.vars = "SampleID", variable.name = "Group", value.name = "InvSimp") %>%
left_join(sample_data(erie), by = c("SampleID" = "SampleID"))
Explore relationships between alpha-diversity and pH
alphadiv_ph_plots <- lapply(levels(alphadiv_df$Group), function(x) {
df <- filter(alphadiv_df, Group == x)
ggplot(df, aes(x = pH, y = InvSimp)) +
geom_point() +
geom_smooth(method = "lm") +
ggtitle(x)
})
do.call(grid.arrange, alphadiv_ph_plots)
# Betaproteo
betas <- filter(alphadiv_df, Group == "Betaproteobacteria")
cor.test(betas$InvSimp, betas$pH)
##
## Pearson's product-moment correlation
##
## data: betas$InvSimp and betas$pH
## t = 4.3644, df = 48, p-value = 6.747e-05
## alternative hypothesis: true correlation is not equal to 0
## 95 percent confidence interval:
## 0.2990222 0.7065324
## sample estimates:
## cor
## 0.5330066
# Bacteroidetes
bacteroidetes <- filter(alphadiv_df, Group == "Bacteroidetes")
cor.test(bacteroidetes$InvSimp, bacteroidetes$pH)
##
## Pearson's product-moment correlation
##
## data: bacteroidetes$InvSimp and bacteroidetes$pH
## t = 3.3108, df = 48, p-value = 0.001772
## alternative hypothesis: true correlation is not equal to 0
## 95 percent confidence interval:
## 0.1736684 0.6334917
## sample estimates:
## cor
## 0.4311732
Relationship between alpha-diversity and Chla
alphadiv_chla_plots <- lapply(levels(alphadiv_df$Group), function(x) {
df <- filter(alphadiv_df, Group == x)
ggplot(df, aes(x = log(Chla), y = InvSimp)) +
geom_point() +
geom_smooth(method = "lm") +
ggtitle(x)
})
do.call(grid.arrange, alphadiv_chla_plots)
# Betaproteo
betas <- filter(alphadiv_df, Group == "Betaproteobacteria")
cor.test(betas$InvSimp, log(betas$Chla))
##
## Pearson's product-moment correlation
##
## data: betas$InvSimp and log(betas$Chla)
## t = 6.3848, df = 51, p-value = 5.096e-08
## alternative hypothesis: true correlation is not equal to 0
## 95 percent confidence interval:
## 0.4832813 0.7937984
## sample estimates:
## cor
## 0.6665102
# Bacteroidetes
bacteroidetes <- filter(alphadiv_df, Group == "Bacteroidetes")
cor.test(bacteroidetes$InvSimp,log(bacteroidetes$Chla))
##
## Pearson's product-moment correlation
##
## data: bacteroidetes$InvSimp and log(bacteroidetes$Chla)
## t = 4.9682, df = 51, p-value = 7.985e-06
## alternative hypothesis: true correlation is not equal to 0
## 95 percent confidence interval:
## 0.3557001 0.7288717
## sample estimates:
## cor
## 0.5710875
Relationship between alpha-diversity and Phycocyanin
alphadiv_phyco_plots <- lapply(levels(alphadiv_df$Group), function(x) {
df <- filter(alphadiv_df, Group == x)
ggplot(df, aes(x = log(Phycocyanin + 0.1), y = InvSimp)) +
geom_point() +
geom_smooth(method = "lm") +
ggtitle(x)
})
do.call(grid.arrange, alphadiv_phyco_plots)
# Betaproteo
betas <- filter(alphadiv_df, Group == "Betaproteobacteria")
cor.test(betas$InvSimp, log(betas$Phycocyanin + 0.1))
##
## Pearson's product-moment correlation
##
## data: betas$InvSimp and log(betas$Phycocyanin + 0.1)
## t = 4.6294, df = 51, p-value = 2.556e-05
## alternative hypothesis: true correlation is not equal to 0
## 95 percent confidence interval:
## 0.3208331 0.7098776
## sample estimates:
## cor
## 0.5439555
# Bacteroidetes
bacteroidetes <- filter(alphadiv_df, Group == "Bacteroidetes")
cor.test(bacteroidetes$InvSimp, log(betas$Phycocyanin + 0.1))
##
## Pearson's product-moment correlation
##
## data: bacteroidetes$InvSimp and log(betas$Phycocyanin + 0.1)
## t = 4.6865, df = 51, p-value = 2.105e-05
## alternative hypothesis: true correlation is not equal to 0
## 95 percent confidence interval:
## 0.3268212 0.7131804
## sample estimates:
## cor
## 0.5486486
Relationship between alpha-diversity and TP
alphadiv_TP_plots <- lapply(levels(alphadiv_df$Group), function(x) {
df <- filter(alphadiv_df, Group == x)
ggplot(df, aes(x = log(TP), y = InvSimp)) +
geom_point() +
geom_smooth(method = "lm") +
ggtitle(x)
})
do.call(grid.arrange, alphadiv_TP_plots)
# Betaproteo
betas <- filter(alphadiv_df, Group == "Betaproteobacteria")
cor.test(betas$InvSimp, log(betas$TP))
##
## Pearson's product-moment correlation
##
## data: betas$InvSimp and log(betas$TP)
## t = 5.7827, df = 51, p-value = 4.482e-07
## alternative hypothesis: true correlation is not equal to 0
## 95 percent confidence interval:
## 0.4325907 0.7688222
## sample estimates:
## cor
## 0.6293024
# Bacteroidetes
bacteroidetes <- filter(alphadiv_df, Group == "Bacteroidetes")
cor.test(bacteroidetes$InvSimp, log(betas$TP))
##
## Pearson's product-moment correlation
##
## data: bacteroidetes$InvSimp and log(betas$TP)
## t = 5.2149, df = 51, p-value = 3.373e-06
## alternative hypothesis: true correlation is not equal to 0
## 95 percent confidence interval:
## 0.3800171 0.7417871
## sample estimates:
## cor
## 0.5897355
Relationship between alpha-diversity and Temp
alphadiv_temp_plots <- lapply(levels(alphadiv_df$Group), function(x) {
df <- filter(alphadiv_df, Group == x)
ggplot(df, aes(x = Temp, y = InvSimp)) +
geom_point() +
geom_smooth(method = "lm") +
ggtitle(x)
})
do.call(grid.arrange, alphadiv_temp_plots)
# Verrucomicrobia
verr <- filter(alphadiv_df, Group == "Verrucomicrobia")
cor.test(verr$InvSimp, verr$Temp)
##
## Pearson's product-moment correlation
##
## data: verr$InvSimp and verr$Temp
## t = -7.6644, df = 51, p-value = 4.879e-10
## alternative hypothesis: true correlation is not equal to 0
## 95 percent confidence interval:
## -0.8364992 -0.5750517
## sample estimates:
## cor
## -0.7316266
# Gamma
gamm <- filter(alphadiv_df, Group == "Gammaproteobacteria")
cor.test(gamm$InvSimp, gamm$Temp)
##
## Pearson's product-moment correlation
##
## data: gamm$InvSimp and gamm$Temp
## t = -6.0812, df = 51, p-value = 1.53e-07
## alternative hypothesis: true correlation is not equal to 0
## 95 percent confidence interval:
## -0.7816459 -0.4583543
## sample estimates:
## cor
## -0.6483267
Seasonal alpha diversity trends
alphadiv_ncbacteria <- filter(alphadiv_df, Group == "NcBacteria") %>%
order_dates()
ggplot(alphadiv_ncbacteria, aes(x = Date, y = InvSimp, group = Station, color = Station, shape = Station)) +
geom_point() +
geom_line() +
scale_color_manual(values = station_colors)
alphadiv_ncbacteria <- filter(alphadiv_df, Group == "Bacteroidetes") %>%
order_dates()
ggplot(alphadiv_ncbacteria, aes(x = Date, y = InvSimp, group = Station, color = Station, shape = Station)) +
geom_point() +
geom_line() +
scale_color_manual(values = station_colors)
alphadiv_ncbacteria <- filter(alphadiv_df, Group == "Betaproteobacteria") %>%
order_dates()
ggplot(alphadiv_ncbacteria, aes(x = Date, y = InvSimp, group = Station, color = Station, shape = Station)) +
geom_point() +
geom_line() +
scale_color_manual(values = station_colors)
alphadiv_ncbacteria <- filter(alphadiv_df, Group == "NcBacteria") %>%
order_dates()
plot3a <- ggplot(alphadiv_ncbacteria, aes(x = Date, y = InvSimp, group = Station, color = Station, shape = Station)) +
geom_point() +
geom_line() +
scale_color_manual(values = station_colors) +
xlab("") +
scale_x_discrete(
breaks = c("7/8", "8/4", "9/2", "10/6"),
labels = c("Jul", "Aug", "Sep", "Oct"),
drop = FALSE
)
ggdraw() +
draw_plot(plot3a, x = 0.02, y = .5, width = .98, height = .49) +
draw_plot(alphadiv_chla_plots[[8]], x = 0.02, y = 0, width = .3, height = .48) +
draw_plot(alphadiv_chla_plots[[3]], x = 0.34, y = 0, width = .3, height = .48) +
draw_plot(alphadiv_chla_plots[[4]], x = 0.66, y = 0, width = .3, height = .48) +
draw_plot_label(c("A", "B"), c(0, 0), c(1, 0.5), size = 14)
# Subset to cyanobacteria and scale internally
cyanos <-
erie %>%
subset_taxa(Class == "Cyanobacteria") %>%
scale_reads(round = "round")
# Subset to non-cyanobacteria and scale internally
non_cyanos <-
erie %>%
subset_taxa(Class != "Cyanobacteria") %>%
scale_reads(round = "round")
# Make a list of phyloseq objects for the full community, cyanos, and non-cyanos
physeq_subsets <- list(erie_scale, cyanos, non_cyanos)
names(physeq_subsets) <- c("full", "Cyanobacteria", "NcBacteria")
# Generate pcoa scores for each subset
pcoas <- lapply(physeq_subsets,
function(x) {
ordinate(
physeq = x,
method = "PCoA",
distance = "bray"
)
}
)
# Generate a df to plot pcoa for each subset
pcoa_dfs <- lapply(pcoas,
function(x, names) {
p <- plot_ordination(
physeq = erie_scale,
axes = 1:3,
ordination = x,
justDF = TRUE
)
p$Month <- factor(p$Month,
levels = c("June", "July", "August", "September", "October"))
p <- p %>%
rename(PC1 = Axis.1, PC2 = Axis.2, PC3 = Axis.3) %>%
order_dates()
return(p)
}
)
# Flip orientation of PC2 for Cyanobacteria (does not affect interpretation)
pcoa_dfs$NcBacteria$PC2 <- -pcoa_dfs$NcBacteria$PC2
# Generate relative, lingoes-corrected eigenvalues for PC1 and PC2
eigs <- lapply(pcoas,
function(x) {
pcs <- c(PC1 = signif(x$values$Rel_corr_eig[1]*100, 3),
PC2 = signif(x$values$Rel_corr_eig[2]*100, 3),
PC3 = signif(x$values$Rel_corr_eig[3]*100, 3)
)
return(pcs)
}
)
pcoa_plots <- Map(
function(x, n) {
ggplot(data = x, aes(x = PC1, y = PC2,
color = Station, shape = Station)) +
geom_point(aes(alpha = Month), size = 2.5) +
scale_color_manual(values = station_colors) +
xlab(paste("PC1 ", eigs[[n]][1], "%")) +
ylab(paste("PC2 ", eigs[[n]][2], "%")) +
theme(plot.margin = unit(c(0, 0.2, 0, 0.2), "cm"))
},
pcoa_dfs, names(pcoa_dfs)
)
# Extract legend
pcoa_legend <- grab_legend(pcoa_plots$full)
# Remove legend from plots
pcoa_plots <- lapply(pcoa_plots,
function(x) {x + theme(legend.position = "none")}
)
# Function to create a plot of time (x-axis) vs PC scores (y-axis)
plot_pcts <- function(df, pc, eigs) {
ggplot(df,
aes_string(x = "Date", y = pc, group = "Station", color = "Station", shape = "Station")) +
geom_point(size = 2.5) +
geom_line(size = 1.1) +
scale_color_manual(values = station_colors) +
scale_x_discrete(
breaks = c("7/8", "8/4", "9/2", "10/6"),
labels = c("Jul", "Aug", "Sep", "Oct"),
drop = FALSE
) +
ylab(paste(pc, " ", eigs[pc], "%")) +
theme(
axis.title.x = element_blank(),
plot.title = element_text(face = "bold", size = 16),
legend.position = "none",
plot.margin = unit(c(0.2, 0.2, 0.2, 0.2), "cm")
)
}
# Non-cyano PC time series plots
non_cyano_pcs <- named_list("PC1", "PC2", "PC3")
non_cyano_pc_plots <- lapply(non_cyano_pcs,
function(x) {
plot_pcts(pcoa_dfs$NcBacteria, x, eigs = eigs$NcBacteria)
}
)
# Plot for pH
ph_plot <- ggplot(pcoa_dfs$NcBacteria, aes(x = Date, y = pH, group = Station, shape = Station, color = Station)) +
geom_line()
# Plot for Temperature
temp_plot <- ggplot(pcoa_dfs$NcBacteria, aes(x = Date, y = Temp, group = Station, shape = Station, color = Station)) +
geom_line()
# Plot for pH
tp_plot <- ggplot(pcoa_dfs$NcBacteria, aes(x = Date, y = TP, group = Station, shape = Station, color = Station)) +
geom_line()
plot(pcoa_dfs$NcBacteria$PC1, log(pcoa_dfs$NcBacteria$TP))
plot(pcoa_dfs$NcBacteria$PC1, pcoa_dfs$NcBacteria$pH)
plot(pcoa_dfs$NcBacteria$PC2, pcoa_dfs$NcBacteria$Temp)
cor.test(pcoa_dfs$NcBacteria$PC1, log(pcoa_dfs$NcBacteria$TP), method = "pearson")
##
## Pearson's product-moment correlation
##
## data: pcoa_dfs$NcBacteria$PC1 and log(pcoa_dfs$NcBacteria$TP)
## t = 6.784, df = 51, p-value = 1.194e-08
## alternative hypothesis: true correlation is not equal to 0
## 95 percent confidence interval:
## 0.5141512 0.8085122
## sample estimates:
## cor
## 0.6887308
cor.test(pcoa_dfs$NcBacteria$PC1, pcoa_dfs$NcBacteria$pH, method = "pearson")
##
## Pearson's product-moment correlation
##
## data: pcoa_dfs$NcBacteria$PC1 and pcoa_dfs$NcBacteria$pH
## t = 5.6315, df = 48, p-value = 9.118e-07
## alternative hypothesis: true correlation is not equal to 0
## 95 percent confidence interval:
## 0.4274487 0.7733266
## sample estimates:
## cor
## 0.6307502
ggdraw() +
## non-cyano
draw_plot(non_cyano_pc_plots$PC1, x = 0.02, y = 0.5, width = 0.3, height = 0.42) +
draw_plot(non_cyano_pc_plots$PC2, x = 0.34, y = 0.5, width = 0.3, height = 0.42) +
draw_plot(non_cyano_pc_plots$PC3, x = 0.66, y = 0.5, width = 0.3, height = 0.42)
how good is my PCoA in two dimensions?
pcoa_dist <- as.vector(dist(pcoas$NcBacteria$vectors[,1:2]))
bray_dist <- as.vector(phyloseq::distance(non_cyanos, method = "bray"))
plot(bray_dist, pcoa_dist) +
abline(lm(pcoa_dist~bray_dist), col = "blue")
## numeric(0)
summary(lm(pcoa_dist~bray_dist))
##
## Call:
## lm(formula = pcoa_dist ~ bray_dist)
##
## Residuals:
## Min 1Q Median 3Q Max
## -0.280556 -0.034510 0.008791 0.045607 0.133647
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) -0.284515 0.006871 -41.41 <2e-16 ***
## bray_dist 1.249986 0.013976 89.44 <2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.06363 on 1376 degrees of freedom
## Multiple R-squared: 0.8532, Adjusted R-squared: 0.8531
## F-statistic: 7999 on 1 and 1376 DF, p-value: < 2.2e-16
ordi <- ordisurf(pcoas$NcBacteria$vectors ~ sample_data(non_cyanos)$pH)
ordi_grid <- ordi$grid
ordi_ph <- expand.grid(x = ordi_grid$x, y = ordi_grid$y) #get x and ys
ordi_ph$z <- as.vector(ordi_grid$z) #unravel the matrix for the z scores
ordi_ph <- data.frame(na.omit(ordi_ph)) #gets rid of the nas
ggplot(pcoa_dfs$NcBacteria, aes(x = PC1, y = PC2)) +
geom_point() +
stat_contour(data = ordi_ph, aes(x = x, y = y, z = z, binwidth = 4))
http://www.fromthebottomoftheheap.net/2011/06/10/what-is-ordisurf-doing/
n <- pcoas$NcB
ordisurf(pcoas$NcBacteria$vectors ~ sample_data(non_cyanos)$pH, main = "pH", method = "ML", select = TRUE)
##
## Family: gaussian
## Link function: identity
##
## Formula:
## y ~ s(x1, x2, k = 10, bs = "tp", fx = FALSE)
##
## Estimated degrees of freedom:
## 6.54 total = 7.54
##
## ML score: -7.987996
ordisurf(pcoas$NcBacteria$vectors ~ sample_data(non_cyanos)$Temp, main = "Temp")
##
## Family: gaussian
## Link function: identity
##
## Formula:
## y ~ s(x1, x2, k = 10, bs = "tp", fx = FALSE)
##
## Estimated degrees of freedom:
## 7.14 total = 8.14
##
## REML score: 102.094
ordisurf(pcoas$NcBacteria$vectors ~ sample_data(non_cyanos)$TP, main = "TP")
##
## Family: gaussian
## Link function: identity
##
## Formula:
## y ~ s(x1, x2, k = 10, bs = "tp", fx = FALSE)
##
## Estimated degrees of freedom:
## 3.64 total = 4.64
##
## REML score: 232.063
ordisurf(pcoas$NcBacteria$vectors ~ sample_data(non_cyanos)$Chla, main = "Chla")
##
## Family: gaussian
## Link function: identity
##
## Formula:
## y ~ s(x1, x2, k = 10, bs = "tp", fx = FALSE)
##
## Estimated degrees of freedom:
## 4.92 total = 5.92
##
## REML score: 202.2073
ordisurf(pcoas$NcBacteria$vectors ~ sample_data(non_cyanos)$Turbidity, main = "Turbidity", method = "ML", select = TRUE)
##
## Family: gaussian
## Link function: identity
##
## Formula:
## y ~ s(x1, x2, k = 10, bs = "tp", fx = FALSE)
##
## Estimated degrees of freedom:
## 1.92 total = 2.92
##
## ML score: 170.5032
library(leaps)
# Function to extract the best subset multiple linear regression model
#
# Args:
# vars: vectors of all variables to consider in the model
# response: response variable of the model
# dat: dataframe with vars and response
#
# Returns: a list with the variables in the best model, bic, cp, and adjusted r2
get_bestsub_summary <- function(vars, response, dat) {
formula = reformulate(termlabels = vars, response = response)
lm_model <- regsubsets(formula, method = "forward", dat)
bic <- summary(lm_model)$bic
cp <- summary(lm_model)$cp
adjr2 <- summary(lm_model)$adjr2
best_model <- summary(lm_model)$which[which.min(bic), ]
return(list(model = best_model, bic = bic, cp = cp, adjr2 = adjr2))
}
# Variables to include in cyano models
cyano_vars <- c("Nitrate", "SRP", "Temp", "H2O2", "SpCond", "Ammonia", "Turbidity", "Days", "TP")
# Variables to include in nc-bacteria models
non_cyano_vars <- c(cyano_vars, "pH", "ParMC", "Chla")
# Impute SpCond values for nearshore 1 on Sep 2 and Sep 8 with value for nearshore 2
pcoa_dfs_impute <- lapply(pcoa_dfs,
function(x) {
# Change 9/2 value
x$SpCond[x$Date == "9/2" & x$Station == "nearshore1"] <-
x$SpCond[x$Date == "9/2" & x$Station == "nearshore2"]
# Change 9/8 value
x$SpCond[x$Date == "9/8" & x$Station == "nearshore1"] <-
x$SpCond[x$Date == "9/8" & x$Station == "nearshore2"]
return(x)
}
)
# get the variables best subset model for the non-cyano community and then
# fit the model to extract coefficients and p-values
non_cyano_models <- lapply(non_cyano_pcs,
function(x) {
best_model <- get_bestsub_summary(non_cyano_vars, x, dat = pcoa_dfs_impute$NcBacteria)
model <- lm(
formula = reformulate(non_cyano_vars[best_model$model[-1]], x),
data = pcoa_dfs_impute$NcBacteria
)
return(model)
}
)
set.seed(1)
boot_models <- list()
for (i in 1:100) {
boot_sample <- sample(pcoa_dfs$NcBacteria$SampleID, replace = TRUE)
newdf <- pcoa_dfs_impute$NcBacteria[boot_sample, ]
best_model <- get_bestsub_summary(non_cyano_vars, "PC1", dat = newdf)
model <- lm(
formula = reformulate(non_cyano_vars[best_model$model[-1]], "PC1"),
data = newdf
)
boot_models[[i]] <- model
}
How many models had pH in them, and how frequently was it the most significant term?
n = 15000
thresh = 0.0001
# Calculate mean relative abundance for each OTU
tax_mean <- taxa_sums(non_cyanos)/nsamples(non_cyanos)
# Prune low abundance taxa using thresh as mean relative abundance
nc_prune <-
non_cyanos %>%
prune_taxa(tax_mean > thresh*n, .)
Species <- tax_table(nc_prune)[,"Species"]
cors <- list()
for (species in Species) {
otu_abun <- as.numeric(otu_table(nc_prune)[species, ])
cors[[species]] <- cor.test(otu_abun, pcoa_dfs$NcBacteria$PC1, method = "pearson")
}
p_values <- lapply(cors, function(x) {x$p.value})
w <- which(unlist(p_values) < (0.01/385))
r_values <- lapply(cors, function(x) {
y <- x$estimate
names(y) <- NULL
return(y)
})
pos <- which(unlist(r_values) > 0)
neg <- which(unlist(r_values) < 0)
pc1_pos_taxa <- intersect(names(w), names(pos))
pc1_neg_taxa <- intersect(names(w), names(neg))
pc1_postax_table <- tax_table(erie)[pc1_pos_taxa,]
pc1_negtax_table <- tax_table(erie)[pc1_neg_taxa,]
p1 <- plot(pcoa_dfs$NcBacteria$PC1, as.numeric(otu_table(nc_prune)["Otu00002",]), xlab = "PC1 score", ylab = "Otu00002 LHab A1", main = paste("r:", round(cors$Otu00002$estimate,2)))
p2 <- plot(pcoa_dfs$NcBacteria$PC1, as.numeric(otu_table(nc_prune)["Otu00004",]),xlab = "PC1 score", ylab = "Otu00004 acI-B", main = paste("r:", round(cors$Otu00004$estimate,2)))
p3 <- plot(pcoa_dfs$NcBacteria$PC1, as.numeric(otu_table(nc_prune)["Otu00006",]),xlab = "PC1 score", ylab = "Otu00006 acI-C", main = paste("r:", round(cors$Otu00006$estimate,2)))
p4 <- plot(pcoa_dfs$NcBacteria$PC1, as.numeric(otu_table(nc_prune)["Otu00011",]),xlab = "PC1 score", ylab = "Otu00011 acI-A", main = paste("r:", round(cors$Otu00011$estimate,2)))
p5 <- plot(pcoa_dfs$NcBacteria$PC1, as.numeric(otu_table(nc_prune)["Otu00075",]),
xlab = "PC1 score", ylab = "Otu00075 alfII-A", main = paste("r:", round(cors$Otu00075$estimate,2)))
p6 <- plot(pcoa_dfs$NcBacteria$PC1, as.numeric(otu_table(nc_prune)["Otu00073",]),xlab = "PC1 score", ylab = "Otu00073 Planctomyces", main = paste("r:", round(cors$Otu00073$estimate,2)))
par(mfrow = c(2,3))